import os
import pandas as pd
from sklearn.model_selection import train_test_split

# Build the raw train/test CSV files from the trec06c e-mail corpus.
#
# Every path is derived from the directory that contains this script, so the
# script gives the same result regardless of where it is launched from.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
MAIL_DIR = os.path.join(BASE_DIR, "trec06c", "full")
OUTPUT_DIR = os.path.join(BASE_DIR, "data")

# Keep the process working directory at the script directory, as before.
# NOTE(review): the previous version finished by chdir-ing to a hard-coded
# absolute Windows path (written in a non-raw string with invalid escape
# sequences); this assumes that path was this script's own directory — confirm.
os.chdir(BASE_DIR)

# 1. Read the index file.
labels, paths = [], []
with open(os.path.join(MAIL_DIR, "index")) as index_file:
    # 2. Parse the index line by line.
    for line in index_file:
        fields = line.split()
        if not fields:
            # Skip blank lines instead of failing on tuple unpacking.
            continue
        # 3. Each line is split on whitespace into exactly two fields:
        #    the label and the mail file path.
        label, path = fields
        labels.append(label)
        paths.append(path)

# 4. Load the content of every mail listed in the index.
# The paths from the index are resolved against the index directory, so no
# change of working directory is needed to find the files.
contents = []
for path in paths:
    # Bytes that cannot be decoded as GBK are dropped (errors="ignore").
    with open(
        os.path.join(MAIL_DIR, path), encoding="gbk", errors="ignore"
    ) as mail_file:
        contents.append(mail_file.read())

# 5. Split into training and test sets (80% / 20%, fixed seed for
#    reproducibility).
x_train, x_test, y_train, y_test = train_test_split(
    contents, labels, test_size=0.2, random_state=22
)

# 6. Save both sets as CSV files for the later processing steps.
# Create the output directory first so that to_csv cannot fail on a fresh
# checkout.
os.makedirs(OUTPUT_DIR, exist_ok=True)

train_data = pd.DataFrame()
train_data["content"] = x_train
train_data["label"] = y_train
train_data.to_csv(os.path.join(OUTPUT_DIR, "01.原始训练集.csv"))

test_data = pd.DataFrame()
test_data["content"] = x_test
test_data["label"] = y_test
test_data.to_csv(os.path.join(OUTPUT_DIR, "01.原始测试集.csv"))

